# Move into the project directory so the relative paths used later in this
# script ("../dataFiles", "DataCleanAndFormatted") resolve.
# The path is specific to one machine, so it is only applied when it exists;
# otherwise the current working directory is kept and reported.
project_dir <- "C:/Users/CarlinML/DACSS-690R/Second_Deliverable"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message("Project directory not found; keeping working directory: ", getwd())
}
getwd()
## [1] "C:/Users/CarlinML/DACSS-690R/Second_Deliverable"

FIRST DATASET - CLEANING

Social Media and Mental Health

# Build the path to the raw survey CSV, relative to the working directory,
# and echo it so the location is visible in the rendered output.
folder <- "../dataFiles"
fileName <- "Social_Media_and_Mental_Health.csv"
fileToRead <- file.path(folder, fileName)
print(fileToRead)
## [1] "../dataFiles/Social_Media_and_Mental_Health.csv"
# Read the raw survey export. check.names = FALSE keeps the original question
# text as column names instead of converting it to syntactic R names.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
SMMH_dirty <- read.csv(fileToRead, check.names = FALSE)
SMMH_dirty

Remove leading and trailing spaces

# Strip leading/trailing whitespace from every column. trimws() returns
# character vectors, so every column is character after this step.
# lapply() always yields one list element per column, whereas sapply()
# changes its return shape depending on the input.
SMMH_dirty[] <- lapply(SMMH_dirty, trimws)
SMMH_clean <- SMMH_dirty
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(SMMH_clean)
}
str(SMMH_clean)
## 'data.frame':    481 obs. of  21 variables:
##  $ Timestamp                                                                                                           : chr  "4/18/2022 19:18:47" "4/18/2022 19:19:28" "4/18/2022 19:25:59" "4/18/2022 19:29:43" ...
##  $ 1. What is your age?                                                                                                : chr  "21" "21" "21" "21" ...
##  $ 2. Gender                                                                                                           : chr  "Male" "Female" "Female" "Female" ...
##  $ 3. Relationship Status                                                                                              : chr  "In a relationship" "Single" "Single" "Single" ...
##  $ 4. Occupation Status                                                                                                : chr  "University Student" "University Student" "University Student" "University Student" ...
##  $ 5. What type of organizations are you affiliated with?                                                              : chr  "University" "University" "University" "University" ...
##  $ 6. Do you use social media?                                                                                         : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ 7. What social media platforms do you commonly use?                                                                 : chr  "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Instagram, YouTube, Pinterest" "Facebook, Instagram" ...
##  $ 8. What is the average time you spend on social media every day?                                                    : chr  "Between 2 and 3 hours" "More than 5 hours" "Between 3 and 4 hours" "More than 5 hours" ...
##  $ 9. How often do you find yourself using Social media without a specific purpose?                                    : chr  "5" "4" "3" "4" ...
##  $ 10. How often do you get distracted by Social media when you are busy doing something?                              : chr  "3" "3" "2" "2" ...
##  $ 11. Do you feel restless if you haven't used Social media in a while?                                               : chr  "2" "2" "1" "1" ...
##  $ 12. On a scale of 1 to 5, how easily distracted are you?                                                            : chr  "5" "4" "2" "3" ...
##  $ 13. On a scale of 1 to 5, how much are you bothered by worries?                                                     : chr  "2" "5" "5" "5" ...
##  $ 14. Do you find it difficult to concentrate on things?                                                              : chr  "5" "4" "4" "3" ...
##  $ 15. On a scale of 1-5, how often do you compare yourself to other successful people through the use of social media?: chr  "2" "5" "3" "5" ...
##  $ 16. Following the previous question, how do you feel about these comparisons, generally speaking?                   : chr  "3" "1" "3" "1" ...
##  $ 17. How often do you look to seek validation from features of social media?                                         : chr  "2" "1" "1" "2" ...
##  $ 18. How often do you feel depressed or down?                                                                        : chr  "5" "5" "4" "4" ...
##  $ 19. On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate?                          : chr  "4" "4" "2" "3" ...
##  $ 20. On a scale of 1 to 5, how often do you face issues regarding sleep?                                             : chr  "5" "5" "5" "2" ...

Fix column names (variable names were too cumbersome when I tried replacing spaces with underscores, so I decided to use the following code to rename the variables instead)

library("dplyr")
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Replace the verbatim survey questions with short, analysis-friendly names.
# Each entry reads new_name = "original column header". Timestamp is not
# listed and therefore keeps its name.
SMMH_clean <- SMMH_clean %>%
  rename(all_of(c(
    Age = "1. What is your age?",
    Gender = "2. Gender",
    RelStatus = "3. Relationship Status",
    OccStatus = "4. Occupation Status",
    OrgAffil = "5. What type of organizations are you affiliated with?",
    UseSocialMedia = "6. Do you use social media?",
    Platforms = "7. What social media platforms do you commonly use?",
    AmtTime = "8. What is the average time you spend on social media every day?",
    WoutPurpose = "9. How often do you find yourself using Social media without a specific purpose?",
    Distracted = "10. How often do you get distracted by Social media when you are busy doing something?",
    Restless = "11. Do you feel restless if you haven't used Social media in a while?",
    EasilyDist = "12. On a scale of 1 to 5, how easily distracted are you?",
    Bothered = "13. On a scale of 1 to 5, how much are you bothered by worries?",
    DiffConcen = "14. Do you find it difficult to concentrate on things?",
    Compare = "15. On a scale of 1-5, how often do you compare yourself to other successful people through the use of social media?",
    Comparisons = "16. Following the previous question, how do you feel about these comparisons, generally speaking?",
    Validation = "17. How often do you look to seek validation from features of social media?",
    Depressed = "18. How often do you feel depressed or down?",
    Interest = "19. On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate?",
    Sleep = "20. On a scale of 1 to 5, how often do you face issues regarding sleep?"
  )))

Run frequency tables of categorical variables to see if any recoding is needed

table(SMMH_clean[["Gender"]]) # look for inconsistent spellings to recode
## 
##              Female                Male                  NB          Non-binary 
##                 263                 211                   1                   1 
##          Non binary           Nonbinary There are others???               Trans 
##                   1                   1                   1                   1 
##              unsure 
##                   1
# Collapse the spelling variants of "Non-binary" into a single category.
# %in% is used rather than == so that a missing Gender value never matches
# and never produces an NA row index.
nonbinary_variants <- c("NB", "Non binary", "Nonbinary")
SMMH_clean$Gender[SMMH_clean$Gender %in% nonbinary_variants] <- "Non-binary"

# Responses that do not name a gender are recoded to the text "NA".
# NOTE(review): this is the literal string "NA", not a missing value (NA);
# confirm that is intended before analysing this column.
unusable_responses <- c("unsure", "There are others???")
SMMH_clean$Gender[SMMH_clean$Gender %in% unusable_responses] <- "NA"
table(SMMH_clean$Gender)
## 
##     Female       Male         NA Non-binary      Trans 
##        263        211          2          4          1

Additional frequency tables and misspelling correction

table(SMMH_clean[["RelStatus"]]) # categories are already consistent; nothing to clean
## 
##          Divorced In a relationship           Married            Single 
##                 7                88               101               285
table(SMMH_clean[["OccStatus"]]) # categories are already consistent; nothing to clean
## 
##            Retired    Salaried Worker     School Student University Student 
##                  8                132                 49                292
table(SMMH_clean[["OrgAffil"]]) # the "N/A" entries need cleaning up
## 
##                        Company               Company, Private 
##                             30                              7 
##                      Goverment                            N/A 
##                              6                             30 
##                        Private                         School 
##                             60                             44 
##                School, Company                    School, N/A 
##                              2                              2 
##                School, Private             School, University 
##                              1                              9 
##    School, University, Private                     University 
##                              4                            239 
##            University, Company University, Company, Goverment 
##                             19                              1 
##   University, Company, Private          University, Goverment 
##                              5                              2 
## University, Goverment, Private                University, N/A 
##                              1                              3 
##            University, Private 
##                             16
# Tidy the organisation-affiliation answers in three passes, applied in order:
#   1. remove every ", N/A" from multi-answer responses,
#   2. turn any remaining "N/A" into "NA",
#   3. correct the misspelling "Goverment".
org_affil <- SMMH_clean$OrgAffil
org_affil <- gsub(", N/A", "", org_affil)
org_affil <- gsub("N/A", "NA", org_affil)
org_affil <- gsub("Goverment", "Government", org_affil)
SMMH_clean$OrgAffil <- org_affil
table(SMMH_clean$OrgAffil)
## 
##                         Company                Company, Private 
##                              30                               7 
##                      Government                              NA 
##                               6                              30 
##                         Private                          School 
##                              60                              46 
##                 School, Company                 School, Private 
##                               2                               1 
##              School, University     School, University, Private 
##                               9                               4 
##                      University             University, Company 
##                             242                              19 
## University, Company, Government    University, Company, Private 
##                               1                               5 
##          University, Government University, Government, Private 
##                               2                               1 
##             University, Private 
##                              16

Additional frequency tables

table(SMMH_clean[["UseSocialMedia"]]) # categories are already consistent; nothing to clean
## 
##  No Yes 
##   3 478
table(SMMH_clean[["Platforms"]]) # may need cleaning, but that would be easier if the data were formatted differently
## 
##                                                                             Discord 
##                                                                                   1 
##                                                                     Discord, Reddit 
##                                                                                   3 
##                                                                            Facebook 
##                                                                                  18 
##                                                                   Facebook, Discord 
##                                                                                   1 
##                                                           Facebook, Discord, Reddit 
##                                                                                   1 
##                                                                 Facebook, Instagram 
##                                                                                  10 
##                                                        Facebook, Instagram, Discord 
##                                                                                   2 
##                                                      Facebook, Instagram, Pinterest 
##                                                                                   1 
##                                                 Facebook, Instagram, Reddit, TikTok 
##                                                                                   1 
##                                                       Facebook, Instagram, Snapchat 
##                                                                                   3 
##                           Facebook, Instagram, Snapchat, Discord, Pinterest, TikTok 
##                                                                                   1 
##                                    Facebook, Instagram, Snapchat, Pinterest, TikTok 
##                                                                                   1 
##                                                         Facebook, Instagram, TikTok 
##                                                                                   2 
##                                                        Facebook, Instagram, YouTube 
##                                                                                  35 
##                                               Facebook, Instagram, YouTube, Discord 
##                                                                                  18 
##                                    Facebook, Instagram, YouTube, Discord, Pinterest 
##                                                                                   7 
##                            Facebook, Instagram, YouTube, Discord, Pinterest, TikTok 
##                                                                                   2 
##                                       Facebook, Instagram, YouTube, Discord, Reddit 
##                                                                                   4 
##                            Facebook, Instagram, YouTube, Discord, Reddit, Pinterest 
##                                                                                   2 
##                               Facebook, Instagram, YouTube, Discord, Reddit, TikTok 
##                                                                                   1 
##                                             Facebook, Instagram, YouTube, Pinterest 
##                                                                                  16 
##                                     Facebook, Instagram, YouTube, Pinterest, TikTok 
##                                                                                   2 
##                                                Facebook, Instagram, YouTube, Reddit 
##                                                                                   3 
##                                     Facebook, Instagram, YouTube, Reddit, Pinterest 
##                                                                                   1 
##                             Facebook, Instagram, YouTube, Reddit, Pinterest, TikTok 
##                                                                                   1 
##                                        Facebook, Instagram, YouTube, Reddit, TikTok 
##                                                                                   1 
##                                              Facebook, Instagram, YouTube, Snapchat 
##                                                                                  28 
##                                     Facebook, Instagram, YouTube, Snapchat, Discord 
##                                                                                  19 
##                          Facebook, Instagram, YouTube, Snapchat, Discord, Pinterest 
##                                                                                   4 
##                  Facebook, Instagram, YouTube, Snapchat, Discord, Pinterest, TikTok 
##                                                                                   5 
##                             Facebook, Instagram, YouTube, Snapchat, Discord, Reddit 
##                                                                                   4 
##                  Facebook, Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest 
##                                                                                   5 
##          Facebook, Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest, TikTok 
##                                                                                   5 
##                     Facebook, Instagram, YouTube, Snapchat, Discord, Reddit, TikTok 
##                                                                                   2 
##                             Facebook, Instagram, YouTube, Snapchat, Discord, TikTok 
##                                                                                   7 
##                                   Facebook, Instagram, YouTube, Snapchat, Pinterest 
##                                                                                   8 
##                           Facebook, Instagram, YouTube, Snapchat, Pinterest, TikTok 
##                                                                                   7 
##                   Facebook, Instagram, YouTube, Snapchat, Reddit, Pinterest, TikTok 
##                                                                                   2 
##                                      Facebook, Instagram, YouTube, Snapchat, TikTok 
##                                                                                   3 
##                                                Facebook, Instagram, YouTube, TikTok 
##                                                                                   4 
##                                                          Facebook, Snapchat, Reddit 
##                                                                                   1 
##                                                                    Facebook, TikTok 
##                                                                                   1 
##                                                                   Facebook, Twitter 
##                                                                                   1 
##                                      Facebook, Twitter, Instagram, Snapchat, Reddit 
##                                                                                   1 
##                                               Facebook, Twitter, Instagram, YouTube 
##                                                                                  14 
##                                      Facebook, Twitter, Instagram, YouTube, Discord 
##                                                                                   3 
##                           Facebook, Twitter, Instagram, YouTube, Discord, Pinterest 
##                                                                                   2 
##                   Facebook, Twitter, Instagram, YouTube, Discord, Pinterest, TikTok 
##                                                                                   1 
##                              Facebook, Twitter, Instagram, YouTube, Discord, Reddit 
##                                                                                   4 
##                   Facebook, Twitter, Instagram, YouTube, Discord, Reddit, Pinterest 
##                                                                                   2 
##           Facebook, Twitter, Instagram, YouTube, Discord, Reddit, Pinterest, TikTok 
##                                                                                   1 
##                      Facebook, Twitter, Instagram, YouTube, Discord, Reddit, TikTok 
##                                                                                   2 
##                                    Facebook, Twitter, Instagram, YouTube, Pinterest 
##                                                                                   7 
##                                       Facebook, Twitter, Instagram, YouTube, Reddit 
##                                                                                   3 
##                            Facebook, Twitter, Instagram, YouTube, Reddit, Pinterest 
##                                                                                   3 
##                    Facebook, Twitter, Instagram, YouTube, Reddit, Pinterest, TikTok 
##                                                                                   1 
##                                     Facebook, Twitter, Instagram, YouTube, Snapchat 
##                                                                                   5 
##                            Facebook, Twitter, Instagram, YouTube, Snapchat, Discord 
##                                                                                   4 
##                 Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Pinterest 
##                                                                                   5 
##         Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Pinterest, TikTok 
##                                                                                   4 
##                    Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Reddit 
##                                                                                   8 
##         Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest 
##                                                                                   6 
## Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest, TikTok 
##                                                                                  11 
##            Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, Reddit, TikTok 
##                                                                                   1 
##                    Facebook, Twitter, Instagram, YouTube, Snapchat, Discord, TikTok 
##                                                                                   5 
##                          Facebook, Twitter, Instagram, YouTube, Snapchat, Pinterest 
##                                                                                   6 
##                  Facebook, Twitter, Instagram, YouTube, Snapchat, Reddit, Pinterest 
##                                                                                   1 
##                     Facebook, Twitter, Instagram, YouTube, Snapchat, Reddit, TikTok 
##                                                                                   1 
##                             Facebook, Twitter, Instagram, YouTube, Snapchat, TikTok 
##                                                                                   3 
##                                                          Facebook, Twitter, YouTube 
##                                                                                   1 
##                                                 Facebook, Twitter, YouTube, Discord 
##                                                                                   1 
##                                      Facebook, Twitter, YouTube, Discord, Pinterest 
##                                                                                   1 
##                                         Facebook, Twitter, YouTube, Discord, Reddit 
##                                                                                   1 
##                                               Facebook, Twitter, YouTube, Pinterest 
##                                                                                   1 
##                                                                   Facebook, YouTube 
##                                                                                  30 
##                                                          Facebook, YouTube, Discord 
##                                                                                   6 
##                                               Facebook, YouTube, Discord, Pinterest 
##                                                                                   2 
##                                                  Facebook, YouTube, Discord, Reddit 
##                                                                                   5 
##                                       Facebook, YouTube, Discord, Reddit, Pinterest 
##                                                                                   1 
##                                                        Facebook, YouTube, Pinterest 
##                                                                                   5 
##                                                           Facebook, YouTube, Reddit 
##                                                                                   2 
##                                                         Facebook, YouTube, Snapchat 
##                                                                                   2 
##                                                Facebook, YouTube, Snapchat, Discord 
##                                                                                   1 
##                                              Facebook, YouTube, Snapchat, Pinterest 
##                                                                                   1 
##                                                           Facebook, YouTube, TikTok 
##                                                                                   3 
##                                                                           Instagram 
##                                                                                   5 
##                                                                  Instagram, Discord 
##                                                                                   1 
##                                       Instagram, Discord, Reddit, Pinterest, TikTok 
##                                                                                   1 
##                                                                   Instagram, Reddit 
##                                                                                   1 
##                                                                  Instagram, YouTube 
##                                                                                   3 
##                                                         Instagram, YouTube, Discord 
##                                                                                   2 
##                                              Instagram, YouTube, Discord, Pinterest 
##                                                                                   1 
##                                                 Instagram, YouTube, Discord, Reddit 
##                                                                                   2 
##                                      Instagram, YouTube, Discord, Reddit, Pinterest 
##                                                                                   1 
##                                         Instagram, YouTube, Discord, Reddit, TikTok 
##                                                                                   1 
##                                                       Instagram, YouTube, Pinterest 
##                                                                                   1 
##                                               Instagram, YouTube, Reddit, Pinterest 
##                                                                                   1 
##                                       Instagram, YouTube, Snapchat, Discord, Reddit 
##                                                                                   2 
##                    Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest, TikTok 
##                                                                                   1 
##                                             Instagram, YouTube, Snapchat, Pinterest 
##                                                                                   1 
##                                                                           Pinterest 
##                                                                                   2 
##                                                                              Reddit 
##                                                                                   4 
##                                                                   Reddit, Pinterest 
##                                                                                   1 
##                                                                              TikTok 
##                                                                                   1 
##                                                                             Twitter 
##                                                                                   1 
##                                                            Twitter, Discord, Reddit 
##                                                                                   2 
##                                                          Twitter, Instagram, TikTok 
##                                                                                   1 
##                                                         Twitter, Instagram, YouTube 
##                                                                                   3 
##                                     Twitter, Instagram, YouTube, Discord, Pinterest 
##                                                                                   1 
##                                         Twitter, Instagram, YouTube, Reddit, TikTok 
##                                                                                   3 
##                              Twitter, Instagram, YouTube, Snapchat, Discord, Reddit 
##                                                                                   1 
##           Twitter, Instagram, YouTube, Snapchat, Discord, Reddit, Pinterest, TikTok 
##                                                                                   1 
##                      Twitter, Instagram, YouTube, Snapchat, Discord, Reddit, TikTok 
##                                                                                   2 
##                    Twitter, Instagram, YouTube, Snapchat, Reddit, Pinterest, TikTok 
##                                                                                   1 
##                                                 Twitter, Instagram, YouTube, TikTok 
##                                                                                   1 
##                                                                    Twitter, YouTube 
##                                                                                   2 
##                                                   Twitter, YouTube, Discord, Reddit 
##                                                                                   1 
##                                                            Twitter, YouTube, Reddit 
##                                                                                   1 
##                                                                             YouTube 
##                                                                                   6 
##                                                                    YouTube, Discord 
##                                                                                   1 
##                                                            YouTube, Discord, Reddit 
##                                                                                   4 
##                                                                  YouTube, Pinterest 
##                                                                                   1 
##                                                                     YouTube, Reddit 
##                                                                                   3 
##                                                          YouTube, Snapchat, Discord 
##                                                                                   1 
##                                                  YouTube, Snapchat, Discord, Reddit 
##                                                                                   1
table(SMMH_clean$AmtTime) # no cleaning needed
## 
## Between 1 and 2 hours Between 2 and 3 hours Between 3 and 4 hours 
##                    70                   101                    93 
## Between 4 and 5 hours     Less than an Hour     More than 5 hours 
##                    67                    34                   116

Save cleaned file to new folder

# Write the cleaned survey data to the output folder,
# creating the folder first when it is not there yet.
folder <- "DataCleanAndFormatted"

if (!dir.exists(folder)) {
  dir.create(folder)
}
write.csv(SMMH_clean, file.path(folder, "SMMH_clean.csv"), row.names = FALSE)

FIRST DATASET - FORMATTING

# Reload the cleaned survey CSV from its GitHub URL and inspect the column types.
linkSMMH_clean <- "https://github.com/DACSS-690R/Second_Deliverable/raw/refs/heads/main/DataCleanAndFormatted/SMMH_clean.csv"
SMMH_clean <- read.csv(file = linkSMMH_clean)
str(SMMH_clean)
## 'data.frame':    481 obs. of  21 variables:
##  $ Timestamp     : chr  "4/18/2022 19:18:47" "4/18/2022 19:19:28" "4/18/2022 19:25:59" "4/18/2022 19:29:43" ...
##  $ Age           : num  21 21 21 21 21 22 21 21 21 20 ...
##  $ Gender        : chr  "Male" "Female" "Female" "Female" ...
##  $ RelStatus     : chr  "In a relationship" "Single" "Single" "Single" ...
##  $ OccStatus     : chr  "University Student" "University Student" "University Student" "University Student" ...
##  $ OrgAffil      : chr  "University" "University" "University" "University" ...
##  $ UseSocialMedia: chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Platforms     : chr  "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Instagram, YouTube, Pinterest" "Facebook, Instagram" ...
##  $ AmtTime       : chr  "Between 2 and 3 hours" "More than 5 hours" "Between 3 and 4 hours" "More than 5 hours" ...
##  $ WoutPurpose   : int  5 4 3 4 3 4 4 5 5 1 ...
##  $ Distracted    : int  3 3 2 2 5 4 3 2 2 1 ...
##  $ Restless      : int  2 2 1 1 4 2 2 3 3 1 ...
##  $ EasilyDist    : int  5 4 2 3 4 3 2 3 3 1 ...
##  $ Bothered      : int  2 5 5 5 5 4 4 3 1 1 ...
##  $ DiffConcen    : int  5 4 4 3 5 3 3 1 1 1 ...
##  $ Compare       : int  2 5 3 5 3 4 5 1 1 1 ...
##  $ Comparisons   : int  3 1 3 1 3 4 3 3 3 1 ...
##  $ Validation    : int  2 1 1 2 3 3 4 1 1 1 ...
##  $ Depressed     : int  5 5 4 4 4 3 5 5 5 1 ...
##  $ Interest      : int  4 4 2 3 4 2 5 5 5 1 ...
##  $ Sleep         : int  5 5 5 2 1 4 3 1 1 1 ...

Nominal variables - create new columns that are factors

# Add a factor-typed "<column>_label" copy of each nominal column;
# the original character columns are left untouched.
nominal_cols <- c(
  "Gender", "RelStatus", "OccStatus", "OrgAffil", "UseSocialMedia"
)
SMMH_clean[paste0(nominal_cols, "_label")] <-
  lapply(SMMH_clean[nominal_cols], as.factor)
str(SMMH_clean)
## 'data.frame':    481 obs. of  26 variables:
##  $ Timestamp           : chr  "4/18/2022 19:18:47" "4/18/2022 19:19:28" "4/18/2022 19:25:59" "4/18/2022 19:29:43" ...
##  $ Age                 : num  21 21 21 21 21 22 21 21 21 20 ...
##  $ Gender              : chr  "Male" "Female" "Female" "Female" ...
##  $ RelStatus           : chr  "In a relationship" "Single" "Single" "Single" ...
##  $ OccStatus           : chr  "University Student" "University Student" "University Student" "University Student" ...
##  $ OrgAffil            : chr  "University" "University" "University" "University" ...
##  $ UseSocialMedia      : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Platforms           : chr  "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Instagram, YouTube, Pinterest" "Facebook, Instagram" ...
##  $ AmtTime             : chr  "Between 2 and 3 hours" "More than 5 hours" "Between 3 and 4 hours" "More than 5 hours" ...
##  $ WoutPurpose         : int  5 4 3 4 3 4 4 5 5 1 ...
##  $ Distracted          : int  3 3 2 2 5 4 3 2 2 1 ...
##  $ Restless            : int  2 2 1 1 4 2 2 3 3 1 ...
##  $ EasilyDist          : int  5 4 2 3 4 3 2 3 3 1 ...
##  $ Bothered            : int  2 5 5 5 5 4 4 3 1 1 ...
##  $ DiffConcen          : int  5 4 4 3 5 3 3 1 1 1 ...
##  $ Compare             : int  2 5 3 5 3 4 5 1 1 1 ...
##  $ Comparisons         : int  3 1 3 1 3 4 3 3 3 1 ...
##  $ Validation          : int  2 1 1 2 3 3 4 1 1 1 ...
##  $ Depressed           : int  5 5 4 4 4 3 5 5 5 1 ...
##  $ Interest            : int  4 4 2 3 4 2 5 5 5 1 ...
##  $ Sleep               : int  5 5 5 2 1 4 3 1 1 1 ...
##  $ Gender_label        : Factor w/ 4 levels "Female","Male",..: 2 1 1 1 1 1 1 1 1 2 ...
##  $ RelStatus_label     : Factor w/ 4 levels "Divorced","In a relationship",..: 2 4 4 4 4 4 3 2 2 4 ...
##  $ OccStatus_label     : Factor w/ 4 levels "Retired","Salaried Worker",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ OrgAffil_label      : Factor w/ 16 levels "Company","Company, Private",..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ UseSocialMedia_label: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...

Ordinal ‘amount of time’ variable

# Lookup from the time-spent answer text to a rank:
# 1 = fewest hours per day, 6 = most hours per day.
map_AmtTime <- c(
  "Less than an Hour" = 1,
  "Between 1 and 2 hours" = 2,
  "Between 2 and 3 hours" = 3,
  "Between 3 and 4 hours" = 4,
  "Between 4 and 5 hours" = 5,
  "More than 5 hours" = 6
)

# Numeric rank of each answer (indexing the lookup by the answer text).
SMMH_clean$AmtTime_int <- map_AmtTime[SMMH_clean$AmtTime]

# Ordered factor version; labels carry the rank prefix so they sort correctly.
SMMH_clean$AmtTime_label <- factor(
  SMMH_clean$AmtTime_int,
  levels = 1:6,
  labels = c(
    "1_Less than an Hour",
    "2_Between 1 and 2 hours",
    "3_Between 2 and 3 hours",
    "4_Between 3 and 4 hours",
    "5_Between 4 and 5 hours",
    "6_More than 5 hours"
  ),
  ordered = TRUE
)

Ordinal likert scale items

# Shared 5-point agreement scale: integer codes and their display labels.
theInts <- 1:5
theLabels <- c(
  "1_Strongly Disagree",
  "2_Disagree",
  "3_Neutral",
  "4_Agree",
  "5_Strongly Agree"
)

# Convert a vector of 1-5 codes into an ordered factor on the agreement scale.
# Values outside `theInts` become NA.
FormatOrdinal <- function(col) {
  factor(col, levels = theInts, labels = theLabels, ordered = TRUE)
}

# Columns 10 to 21 hold the integer-coded survey items;
# replace each one in place with its ordered-factor version.
names <- 10:21
SMMH_clean[names] <- Map(FormatOrdinal, SMMH_clean[names])

str(SMMH_clean)
## 'data.frame':    481 obs. of  28 variables:
##  $ Timestamp           : chr  "4/18/2022 19:18:47" "4/18/2022 19:19:28" "4/18/2022 19:25:59" "4/18/2022 19:29:43" ...
##  $ Age                 : num  21 21 21 21 21 22 21 21 21 20 ...
##  $ Gender              : chr  "Male" "Female" "Female" "Female" ...
##  $ RelStatus           : chr  "In a relationship" "Single" "Single" "Single" ...
##  $ OccStatus           : chr  "University Student" "University Student" "University Student" "University Student" ...
##  $ OrgAffil            : chr  "University" "University" "University" "University" ...
##  $ UseSocialMedia      : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Platforms           : chr  "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Twitter, Instagram, YouTube, Discord, Reddit" "Facebook, Instagram, YouTube, Pinterest" "Facebook, Instagram" ...
##  $ AmtTime             : chr  "Between 2 and 3 hours" "More than 5 hours" "Between 3 and 4 hours" "More than 5 hours" ...
##  $ WoutPurpose         : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 5 4 3 4 3 4 4 5 5 1 ...
##  $ Distracted          : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 3 3 2 2 5 4 3 2 2 1 ...
##  $ Restless            : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 2 2 1 1 4 2 2 3 3 1 ...
##  $ EasilyDist          : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 5 4 2 3 4 3 2 3 3 1 ...
##  $ Bothered            : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 2 5 5 5 5 4 4 3 1 1 ...
##  $ DiffConcen          : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 5 4 4 3 5 3 3 1 1 1 ...
##  $ Compare             : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 2 5 3 5 3 4 5 1 1 1 ...
##  $ Comparisons         : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 3 1 3 1 3 4 3 3 3 1 ...
##  $ Validation          : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 2 1 1 2 3 3 4 1 1 1 ...
##  $ Depressed           : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 5 5 4 4 4 3 5 5 5 1 ...
##  $ Interest            : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 4 4 2 3 4 2 5 5 5 1 ...
##  $ Sleep               : Ord.factor w/ 5 levels "1_Strongly Disagree"<..: 5 5 5 2 1 4 3 1 1 1 ...
##  $ Gender_label        : Factor w/ 4 levels "Female","Male",..: 2 1 1 1 1 1 1 1 1 2 ...
##  $ RelStatus_label     : Factor w/ 4 levels "Divorced","In a relationship",..: 2 4 4 4 4 4 3 2 2 4 ...
##  $ OccStatus_label     : Factor w/ 4 levels "Retired","Salaried Worker",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ OrgAffil_label      : Factor w/ 16 levels "Company","Company, Private",..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ UseSocialMedia_label: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 1 ...
##  $ AmtTime_int         : num  3 6 4 6 3 3 4 6 6 1 ...
##  $ AmtTime_label       : Ord.factor w/ 6 levels "1_Less than an Hour"<..: 3 6 4 6 3 3 4 6 6 1 ...

Save as RDS

# Save the formatted survey data as RDS (keeps the factor types),
# creating the output folder first when it is not there yet.
folder <- "DataCleanAndFormatted"

if (!dir.exists(folder)) {
  dir.create(folder)
}
saveRDS(SMMH_clean, file.path(folder, "SMMH_formatted.RDS"))

SECOND DATASET - CLEANING

Clerkship grades; the first line of the Excel file needs to be skipped

library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
# Build the relative path to the clerkship grades workbook and echo it.
folder <- "../dataFiles"
fileName <- "UMassChan_ClerkshipGrades.xlsx"
fileToRead <- file.path(folder, fileName)
print(fileToRead)
## [1] "../dataFiles/UMassChan_ClerkshipGrades.xlsx"
# Skip the first row of the sheet so the column names are read from the second.
ClerkshipGrades_dirty <- read_xlsx(path = fileToRead, skip = 1)
print(ClerkshipGrades_dirty)

Remove leading and trailing spaces

# Trim surrounding whitespace in every column. trimws() returns character,
# so every column is a character vector afterwards.
ClerkshipGrades_dirty[] <- lapply(ClerkshipGrades_dirty, trimws)
ClerkshipGrades_clean <- ClerkshipGrades_dirty
View(ClerkshipGrades_clean)
str(ClerkshipGrades_clean)
## tibble [6,427 × 14] (S3: tbl_df/tbl/data.frame)
##  $ Term                : chr [1:6427] "5300" "5301" "5301" "5301" ...
##  $ Student_ID          : chr [1:6427] "1793" "1793" "1793" "1793" ...
##  $ Subject             : chr [1:6427] "OB" "PS" "FC" "PE" ...
##  $ Catalog             : chr [1:6427] "300" "300" "300" "300" ...
##  $ Session             : chr [1:6427] "S1C" "S3A" "S3B" "S3C" ...
##  $ Location            : chr [1:6427] "Cape Cod Hospital" "TaraVista Behavioral Health Center" "Worcester Area South" "UMMHC-University Campus" ...
##  $ Final_Letter        : chr [1:6427] "H" "HH" "H" "HH" ...
##  $ Final_Numeric       : chr [1:6427] "83" "91" "92.3" "3.7" ...
##  $ SPE_Letter          : chr [1:6427] "HONORS" "High Honors" "High Honors" "High Honors" ...
##  $ SPE_Numeric         : chr [1:6427] "37" "96" "43.75" "3.55" ...
##  $ NBME-Written_Letter : chr [1:6427] "Honors" "Pass" "Honors" "Honors" ...
##  $ NBME-Written_Numeric: chr [1:6427] "11" "78" "18" "87" ...
##  $ OSCE_Letter         : chr [1:6427] "Honors" "High Honors" "Honors" "High Honors" ...
##  $ OSCE_Numeric        : chr [1:6427] "20" "93" "16.45" "93" ...

Remove dashes from two variable names

# Replace the two column names containing a dash with syntactic names, so they
# can be used without backticks.
ClerkshipGrades_clean <- ClerkshipGrades_clean %>%
  rename(
    NBME_Letter = `NBME-Written_Letter`,
    NBME_Numeric = `NBME-Written_Numeric`
  )

Run frequency tables of categorical variables to see if any recoding is needed

table(ClerkshipGrades_clean$Subject) # no cleaning needed
## 
##  FC  ME  NU  OB  PE  PS  SU 
## 942 790 934 960 938 938 925
table(ClerkshipGrades_clean$Catalog) # no cleaning needed 
## 
##  300 300B  302 
## 4703  790  934
table(ClerkshipGrades_clean$Session) # no cleaning needed
## 
## 1A4 1A5 1B3 1B4 1B5 1C3 1C5 2A4 2A5 2B3 2B5 2C3 2C5 3A3 3A4 3A5 3B3 3B4 3B5 3C3 
##  88 143  54  32 152  84 141  93 151  88 137  85 146  35  52 129  47  33 123  81 
## 3C5 S1A S1B S1C S2A S2B S2C S3A S3B S3C 
## 115 477 504 523 466 470 499 491 477 511
table(ClerkshipGrades_clean$Location) # no cleaning needed
## 
##                                  Baystate Medical Center 
##                                                      269 
##                Baystate Medical Center - Springfield, MA 
##                                                      809 
##                                           Berkshire Area 
##                                                        3 
##                                 Berkshire Medical Center 
##                                                      373 
##                                              Boston Area 
##                                                       14 
##                                        Cape Cod Hospital 
##                                                      495 
##                               Charlton Memorial Hospital 
##                                                        2 
##                     Dr. J. Corrigan Mental Health Center 
##                                                       51 
##             Emergency Mental Health Service/Memorial C/L 
##                                                        1 
##                                        Falmouth Hospital 
##                                                       43 
##                                    Holyoke Health Center 
##                                                       16 
##                         Hospital for Behavioral Medicine 
##                                                       19 
##                          Lahey Hospital & Medical Center 
##                                                       51 
##                                         Middlesex County 
##                                                       17 
##                          Milford Regional Medical Center 
##                                                      410 
##                                      Pioneer Valley Area 
##                                                       10 
##                            Pocasset Mental Health Center 
##                                                        8 
##                     Saint Vincent Hospital Massachusetts 
##                                                      506 
##                                           Southeast Area 
##                                                       28 
##                                      St. Luke's Hospital 
##                                                        4 
##                       TaraVista Behavioral Health Center 
##                                                      113 
##                                   Taunton State Hospital 
##                                                       22 
##                                             UMMHC-8 East 
##                                                      101 
##                                                UMMHC-C/L 
##                                                      115 
##                                    UMMHC-C/L (Addiction) 
##                                                        2 
##                                    UMMHC-C/L (Pediatric) 
##                                                        3 
##                               UMMHC-Marlborough Hospital 
##                                                       88 
##                                    UMMHC-Memorial Campus 
##                                                      632 
##                                UMMHC-Memorial Colorectal 
##                                                       72 
##                                       UMMHC-Memorial MIS 
##                                                       42 
##                         UMMHC-Memorial Surgical Oncology 
##                                                       42 
##   UMMHC-Psychiatric Treatment and Recovery Center (PTRC) 
##                                                       28 
##                     UMMHC-University & Memorial Campuses 
##                                                       98 
##                                     UMMHC-University ACS 
##                                                       44 
##                                  UMMHC-University Campus 
##                                                     1020 
##                                  UMMHC-University Haidak 
##                                                       42 
##                                     Worcester Area North 
##                                                      304 
##                                     Worcester Area South 
##                                                      449 
## Worcester Recovery Center and Hospital - Adolescent Unit 
##                                                       29 
##      Worcester Recovery Center and Hospital - Adult Unit 
##                                                       52
table(ClerkshipGrades_clean$Final_Letter)
## 
##    F    H   HH    I    P    S 
##    3 3782 2221    5  305  111
# Spell out the abbreviated final grades in a new column, then re-tabulate.
ClerkshipGrades_clean$Final_Letter2 <- recode(
  ClerkshipGrades_clean$Final_Letter,
  "HH" = "High Honors",
  "H" = "Honors",
  "P" = "Pass",
  "S" = "Satisfactory",
  "F" = "Fail",
  "I" = "Incomplete"
)
table(ClerkshipGrades_clean$Final_Letter2)
## 
##         Fail  High Honors       Honors   Incomplete         Pass Satisfactory 
##            3         2221         3782            5          305          111
table(ClerkshipGrades_clean$SPE_Letter) 
## 
## High Honors      Honors      HONORS  Incomplete        Pass 
##        3853        2060           4           1         133
# Merge the all-caps "HONORS" spelling into "Honors" in a new column,
# then re-tabulate.
ClerkshipGrades_clean$SPE_Letter2 <- recode(
  ClerkshipGrades_clean$SPE_Letter,
  "HONORS" = "Honors"
)
table(ClerkshipGrades_clean$SPE_Letter2)
## 
## High Honors      Honors  Incomplete        Pass 
##        3853        2064           1         133
table(ClerkshipGrades_clean$NBME_Letter)
## 
##         Fail  High Honors  HIGH HONORS       Honors       HONORS   Incomplete 
##            3         1183           11         2437           15            5 
##         Pass         PASS Satisfactory 
##         1805           17           37
# Merge the all-caps spellings into their title-case equivalents in a new
# column, then re-tabulate.
ClerkshipGrades_clean$NBME_Letter2 <- recode(
  ClerkshipGrades_clean$NBME_Letter,
  "HIGH HONORS" = "High Honors",
  "HONORS" = "Honors",
  "PASS" = "Pass"
)
table(ClerkshipGrades_clean$NBME_Letter2)
## 
##         Fail  High Honors       Honors   Incomplete         Pass Satisfactory 
##            3         1194         2452            5         1822           37
table(ClerkshipGrades_clean$OSCE_Letter)
## 
##         Fail  High Honors  HIGH HONORS       Honors       HONORS   Incomplete 
##            3         2199            1         2964            8            1 
##         Pass         PASS Satisfactory 
##         1187            9           54
# Merge the all-caps spellings into their title-case equivalents in a new
# column, then re-tabulate.
ClerkshipGrades_clean$OSCE_Letter2 <- recode(
  ClerkshipGrades_clean$OSCE_Letter,
  "HIGH HONORS" = "High Honors",
  "HONORS" = "Honors",
  "PASS" = "Pass"
)
table(ClerkshipGrades_clean$OSCE_Letter2)
## 
##         Fail  High Honors       Honors   Incomplete         Pass Satisfactory 
##            3         2200         2972            1         1196           54

Cleaning numeric variables

# Show the rows with at least one missing value, then the first row.
ClerkshipGrades_clean[!complete.cases(ClerkshipGrades_clean), ]
ClerkshipGrades_clean[1, ]

# For each numeric-grade column, count the values that are NA after coercion
# to numeric (values already missing plus entries that are not valid numbers).
# The columns are selected by name so that exactly the four *_Numeric columns
# are checked, independent of where they sit in the table.
numeric_grade_cols <- c(
  "Final_Numeric", "SPE_Numeric", "NBME_Numeric", "OSCE_Numeric"
)
vapply(
  ClerkshipGrades_clean[numeric_grade_cols],
  function(col) sum(is.na(as.numeric(col))),
  integer(1)
)
## Warning in apply(ClerkshipGrades_clean[, c(10, 12, 14, 16)], 2, as.numeric):
## NAs introduced by coercion
## Warning in apply(ClerkshipGrades_clean[, c(10, 12, 14, 16)], 2, as.numeric):
## NAs introduced by coercion
## Warning in apply(ClerkshipGrades_clean[, c(10, 12, 14, 16)], 2, as.numeric):
## NAs introduced by coercion
##  SPE_Numeric NBME_Numeric OSCE_Numeric  SPE_Letter2 
##          377           44            6         6427
# Return the non-missing entries of `col` that are not plain non-negative
# decimal numbers (digits with at most one decimal point), e.g. "4 (7" or
# "11,4".
#
# The whole value is checked against an anchored pattern rather than searched
# for a single forbidden character, so malformed strings made only of digits,
# ".", "+" or "*" (such as "1.2.3" or "+") are reported as well.
detectWrongNA <- function(col) {
  is_number <- grepl("^([0-9]+\\.?[0-9]*|\\.[0-9]+)$", col)
  # grepl() yields FALSE for NA, so genuine missing values are excluded here.
  col[!is.na(col) & !is_number]
}
# Collect every distinct entry flagged by detectWrongNA() across the four
# numeric-grade columns, print them, then set those entries to NA.
numeric_grade_cols <- c(
  "Final_Numeric", "SPE_Numeric", "NBME_Numeric", "OSCE_Numeric"
)
badSymbolNum <- sapply(
  ClerkshipGrades_clean[, numeric_grade_cols],
  detectWrongNA
)
badSymbolNum_unlist <- unlist(badSymbolNum)
badSymbolNum_vector <- unique(badSymbolNum_unlist)
badSymbolNum_vector

ClerkshipGrades_clean[, numeric_grade_cols] <- lapply(
  ClerkshipGrades_clean[, numeric_grade_cols],
  function(col) {
    col[col %in% badSymbolNum_vector] <- NA
    col
  }
)

ClerkshipGrades_clean

Save cleaned file to new folder

# Write the cleaned clerkship grades to the output folder, creating the folder
# first when it is not there yet.
folder <- "DataCleanAndFormatted"

if (!dir.exists(folder)) {
  dir.create(folder)
}
# The clerkship data is written whether or not the folder already existed.
# write.csv()'s default row names are kept, so the file has a leading index
# column (named `X` when read back with read.csv()).
write.csv(ClerkshipGrades_clean, file.path(folder, "ClerkshipGrades_clean.csv"))

SECOND DATASET - FORMATTING

# Reload the cleaned clerkship CSV from its GitHub URL and inspect the column
# types.
linkClerkshipGrades_clean <- "https://github.com/DACSS-690R/Second_Deliverable/raw/refs/heads/main/DataCleanAndFormatted/ClerkshipGrades_clean.csv"
ClerkshipGrades_clean <- read.csv(file = linkClerkshipGrades_clean)
str(ClerkshipGrades_clean)
## 'data.frame':    6427 obs. of  19 variables:
##  $ X            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Term         : int  5300 5301 5301 5301 5300 5300 5300 5000 5001 5001 ...
##  $ Student_ID   : int  1793 1793 1793 1793 1793 1793 1793 1379 1379 1379 ...
##  $ Subject      : chr  "OB" "PS" "FC" "PE" ...
##  $ Catalog      : chr  "300" "300" "300" "300" ...
##  $ Session      : chr  "S1C" "S3A" "S3B" "S3C" ...
##  $ Location     : chr  "Cape Cod Hospital" "TaraVista Behavioral Health Center" "Worcester Area South" "UMMHC-University Campus" ...
##  $ Final_Letter : chr  "H" "HH" "H" "HH" ...
##  $ Final_Numeric: num  83 91 92.3 3.7 3 3.3 95.8 NA NA NA ...
##  $ SPE_Letter   : chr  "HONORS" "High Honors" "High Honors" "High Honors" ...
##  $ SPE_Numeric  : num  37 96 43.75 3.55 3.6 ...
##  $ NBME_Letter  : chr  "Honors" "Pass" "Honors" "Honors" ...
##  $ NBME_Numeric : num  11 78 18 87 67 2.1 75 11 82 12.1 ...
##  $ OSCE_Letter  : chr  "Honors" "High Honors" "Honors" "High Honors" ...
##  $ OSCE_Numeric : num  20 93 16.4 93 62 ...
##  $ Final_Letter2: chr  "Honors" "High Honors" "Honors" "High Honors" ...
##  $ SPE_Letter2  : chr  "Honors" "High Honors" "High Honors" "High Honors" ...
##  $ NBME_Letter2 : chr  "Honors" "Pass" "Honors" "Honors" ...
##  $ OSCE_Letter2 : chr  "Honors" "High Honors" "Honors" "High Honors" ...

Change all Letter Grade columns to uppercase, and convert to ordered factors

# Columns 16 to 19 are the recoded *_Letter2 grades. Upper-case them and turn
# them into ordered factors running from FAIL (lowest) to HIGH HONORS (highest).
Likert_cols <- c(16:19)
ClerkshipGrades_clean[, Likert_cols] <- lapply(
  ClerkshipGrades_clean[, Likert_cols],
  function(grade) {
    factor(
      toupper(grade),
      levels = c(
        "FAIL", "INCOMPLETE", "SATISFACTORY", "PASS", "HONORS", "HIGH HONORS"
      ),
      ordered = TRUE
    )
  }
)
str(ClerkshipGrades_clean)
## 'data.frame':    6427 obs. of  19 variables:
##  $ X            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Term         : int  5300 5301 5301 5301 5300 5300 5300 5000 5001 5001 ...
##  $ Student_ID   : int  1793 1793 1793 1793 1793 1793 1793 1379 1379 1379 ...
##  $ Subject      : chr  "OB" "PS" "FC" "PE" ...
##  $ Catalog      : chr  "300" "300" "300" "300" ...
##  $ Session      : chr  "S1C" "S3A" "S3B" "S3C" ...
##  $ Location     : chr  "Cape Cod Hospital" "TaraVista Behavioral Health Center" "Worcester Area South" "UMMHC-University Campus" ...
##  $ Final_Letter : chr  "H" "HH" "H" "HH" ...
##  $ Final_Numeric: num  83 91 92.3 3.7 3 3.3 95.8 NA NA NA ...
##  $ SPE_Letter   : chr  "HONORS" "High Honors" "High Honors" "High Honors" ...
##  $ SPE_Numeric  : num  37 96 43.75 3.55 3.6 ...
##  $ NBME_Letter  : chr  "Honors" "Pass" "Honors" "Honors" ...
##  $ NBME_Numeric : num  11 78 18 87 67 2.1 75 11 82 12.1 ...
##  $ OSCE_Letter  : chr  "Honors" "High Honors" "Honors" "High Honors" ...
##  $ OSCE_Numeric : num  20 93 16.4 93 62 ...
##  $ Final_Letter2: Ord.factor w/ 6 levels "FAIL"<"INCOMPLETE"<..: 5 6 5 6 5 5 6 5 5 5 ...
##  $ SPE_Letter2  : Ord.factor w/ 6 levels "FAIL"<"INCOMPLETE"<..: 5 6 6 6 6 6 6 5 NA 6 ...
##  $ NBME_Letter2 : Ord.factor w/ 6 levels "FAIL"<"INCOMPLETE"<..: 5 4 5 5 4 4 5 5 5 NA ...
##  $ OSCE_Letter2 : Ord.factor w/ 6 levels "FAIL"<"INCOMPLETE"<..: 5 6 5 6 4 5 6 5 5 6 ...

Save as RDS

# Save the formatted clerkship grades as RDS (keeps the ordered-factor types),
# creating the output folder first when it is not there yet.
folder <- "DataCleanAndFormatted"

if (!dir.exists(folder)) {
  dir.create(folder)
}
# One write for both cases, so the file name is the same on the first run as
# on later runs (no stray newline or padding in the name).
saveRDS(
  ClerkshipGrades_clean,
  file.path(folder, "ClerkshipGrades_formatted.RDS")
)

THIRD DATASET - subset of crime data

#install.packages("jsonlite")
#library(jsonlite)
#endPoint="https://data.lacity.org/resource/2nrs-mtv8.json"
#LA_Crime_data = fromJSON(endPoint)
#View(LA_Crime_data)
#write.csv(LA_Crime_data, "../dataFiles/Crime_Data_subset.csv")